In this document we present the joint analysis of the PASS1A metabolomics datasets.

Load all datasets

Load the data from the cloud, including: phenotypic data, metabolomic datasets, and metabolomics dictionary.

# load the data and functions to the session

# helper functions for unsupervised data matrix normalization
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/unsupervised_normalization_functions.R")
# basic functions for reading data objects from GCP
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/gcp_functions.R")
# helper functions for data reformat/reshape
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/data_aux_functions.R")
# helper functions for reading and checking the metabolomics data from the cloud
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/metabolomics_data_parsing_functions.R")
# functions for association analysis
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/association_analysis_methods.R")
# functions for classification/regression and cross-validation
source("~/Desktop/repos/motrpac/tools/prediction_ml_tools.R")
library(metafor) # for meta-analysis
library(randomForest) # for classification tests
library(VennDiagram) # for overlaps
require(gridExtra) # for the venn diagrams
library(corrplot) # for overlap/correlation matrices
library(ggplot2)

# Load the dmaqc data
merged_dmaqc_data =  load_from_bucket("merged_dmaqc_data2019-10-15.RData",
    "gs://bic_data_analysis/pass1a/pheno_dmaqc/",T)
merged_dmaqc_data = merged_dmaqc_data[[1]]
rownames(merged_dmaqc_data) = as.character(merged_dmaqc_data$vial_label)
# define the tissue variable
merged_dmaqc_data$tissue = merged_dmaqc_data$sampletypedescription
# define the time to freeze variable
merged_dmaqc_data$time_to_freeze = merged_dmaqc_data$calculated.variables.time_death_to_collect_min + 
  merged_dmaqc_data$calculated.variables.time_collect_to_freeze_min

# blood freeze times
blood_samples = 
  merged_dmaqc_data$specimen.processing.sampletypedescription ==
  "EDTA Plasma"
blood_freeze_time = 
  as.difftime(merged_dmaqc_data$specimen.processing.t_freeze,units = "mins") -
  as.difftime(merged_dmaqc_data$specimen.processing.t_edtaspin,units="mins")
blood_freeze_time = as.numeric(blood_freeze_time)
time_to_freeze = merged_dmaqc_data$time_to_freeze[blood_samples] = 
  blood_freeze_time[blood_samples]

# Load our parsed metabolomics datasets
metabolomics_bucket_obj = load_from_bucket(
  file = "metabolomics_parsed_datasets_pass1a_external_v2_04112020.RData",
  bucket = "gs://bic_data_analysis/pass1a/metabolomics/")
metabolomics_parsed_datasets = metabolomics_bucket_obj$metabolomics_parsed_datasets

# destination for selected figures, tables and RData files
local_destination = "~/Desktop/pass1a_met_files/" # change if you want another dir
try(dir.create(local_destination))
analysis_out_files_dest = paste(local_destination,"/analysis_out_files/",sep="")
try(dir.create(analysis_out_files_dest))

Define the parameters for the analyses below including which covariates to take and what normalization methods to consider.

# Define the variables to be adjusted for:
biospec_cols = c(
  "acute.test.distance",
  "calculated.variables.time_to_freeze",
  "bid" # required for matching datasets
  )
differential_analysis_cols = c(
  "animal.registration.sex",
  "animal.key.timepoint",
  "animal.key.is_control",
  "sample_order"
)
# normalization method pairs to test
# specifies which normalized data entry to take
# first is for targeted datsets, second is for untargeted
tar_vs_untar_norm_pairs = data.frame(
  tar_nrom = c("norm:none;metab:none","norm:none;metab:none"),
  untar_norm = c("norm:none;metab:none","norm:med;metab:log"),
  stringsAsFactors = F  
)
# specify time points for pairwise differential analysis
tp_pairs_for_de = data.frame(
  tp1 = c(0.0,1.0,4.0,4.0,24.0,7.0),
  tp1_is_control = c(F,F,F,F,F,F),
  tp2 = c(0.0,0.0,1.0,7.0,0.0,7.0),
  tp2_is_control = c(T,F,F,T,T,T)
)

Preprocess the data to ease the analyses below

# load the data
metabolomics_processed_datasets = load_from_bucket(
  file="metabolomics_processed_datasets04072020.RData",
  bucket = "gs://bic_data_analysis/pass1a/metabolomics/"
)[[1]]

#' Transform the row names to motrpac names whenever possible.
get_motrpac_dict_rownames<-function(x,row_annot_x){
  if(length(setdiff(rownames(x),row_annot_x[,1]))>0){
    stop("rownames in data and annotation do not match")
  }
  if(length(setdiff(row_annot_x[,1],rownames(x)))>0){
    warning("annotation has additional metabolites")
  }
  if(! "motrpac_comp_name" %in% colnames(row_annot_x)){
    stop("no motrpac_comp_name column in annotation")
  }
  row_annot_x = row_annot_x[rownames(x),]
  y = row_annot_x[,"motrpac_comp_name"]
  inds = !is.na(y) & y!=""
  rnames = rownames(x)
  rnames[inds] = y[inds]
  return(rnames)
}

# Reduce the metadata to the selected columns, transform row names 
# to motrpac ids and add bid-based names
for(currname in names(metabolomics_processed_datasets)){
  d = metabolomics_processed_datasets[[currname]]
  curr_data = d$normalized_data[[1]]
  # organize the metadata
  # some covariates may not appear in the data because they had
  # too many NAs
  covered_covs = intersect(union(biospec_cols,differential_analysis_cols),
          colnames(d$sample_meta))
  curr_meta = d$sample_meta[colnames(curr_data),covered_covs]
  # remove metadata variables with too many NAs
  na_counts = apply(is.na(curr_meta),2,sum)
  curr_meta = curr_meta[,na_counts/nrow(curr_meta) < 0.1]
  motrpac_met_names = get_motrpac_dict_rownames(d$normalized_data[[1]],d$row_annot)
  names(motrpac_met_names) = rownames(d$normalized_data[[1]])
  # add the new fields
  d$covs = curr_meta
  d$motrpac_met_names = motrpac_met_names
  metabolomics_processed_datasets[[currname]] = d
}

# Transform sample id to bid
for(currname in names(metabolomics_processed_datasets)){
  d = metabolomics_processed_datasets[[currname]]
  bids = as.character(d$sample_meta[colnames(d$normalized_data[[1]]),"bid"])
  if(any(bids!=d$sample_meta[,"bid"])){
    print(paste("Warning in dataset",
           currname,"sample order in data and metadata do not match"))
  }
  if(length(unique(bids))==ncol(d$normalized_data[[1]])){
    rownames(d$sample_meta) = bids
    for(j in 1:length(d$normalized_data)){
      colnames(d$normalized_data[[j]]) = bids
    }
  }
  else{
    print(paste("there are ",nrow(d$sample_meta),"samples in dataset",currname))
    print(paste(" => bid column names are not unique in:",currname,", merging"))
    oldmeta = d$sample_meta
    newmeta_inds = c()
    for(bid in unique(bids)){
      newmeta_inds = c(newmeta_inds,which(bids==bid)[1])
    }
    newmeta = oldmeta[newmeta_inds,]
    rownames(newmeta) = as.character(newmeta[,"bid"])
    d$sample_meta = newmeta
    for(j in 1:length(d$normalized_data)){
      d$normalized_data[[j]] = aggregate_repeated_samples(
        d$normalized_data[[j]],bids,na.rm=T
      )
      if(!(all(colnames(d$normalized_data[[j]]) %in%
               rownames(newmeta)))){
        print(paste("!!! Error in merging samples by bid in dataset",currname))
      }
      d$normalized_data[[j]] = d$normalized_data[[j]][
        ,rownames(newmeta)]
    }
    d$motrpac_met_names = d$motrpac_met_names[rownames(d$normalized_data[[1]])]
    print("done")
  }
  metabolomics_processed_datasets[[currname]] = d
  
}
## [1] "there are  156 samples in dataset gastrocnemius,tar,ac_mayo,mayo"
## [1] " => bid column names are not unique in: gastrocnemius,tar,ac_mayo,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset gastrocnemius,tar,amines,mayo"
## [1] " => bid column names are not unique in: gastrocnemius,tar,amines,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset gastrocnemius,tar,cer_mayo,mayo"
## [1] " => bid column names are not unique in: gastrocnemius,tar,cer_mayo,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset gastrocnemius,tar,tca,mayo"
## [1] " => bid column names are not unique in: gastrocnemius,tar,tca,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset liver,tar,ac_mayo,mayo"
## [1] " => bid column names are not unique in: liver,tar,ac_mayo,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset liver,tar,amines,mayo"
## [1] " => bid column names are not unique in: liver,tar,amines,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset liver,tar,cer_mayo,mayo"
## [1] " => bid column names are not unique in: liver,tar,cer_mayo,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset liver,tar,tca,mayo"
## [1] " => bid column names are not unique in: liver,tar,tca,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset white_adipose,tar,ac_mayo,mayo"
## [1] " => bid column names are not unique in: white_adipose,tar,ac_mayo,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset white_adipose,tar,amines,mayo"
## [1] " => bid column names are not unique in: white_adipose,tar,amines,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset white_adipose,tar,cer_mayo,mayo"
## [1] " => bid column names are not unique in: white_adipose,tar,cer_mayo,mayo , merging"
## [1] "done"
## [1] "there are  156 samples in dataset white_adipose,tar,tca,mayo"
## [1] " => bid column names are not unique in: white_adipose,tar,tca,mayo , merging"
## [1] "done"
# Get the named datasets separately - easier to work with later
processed_named_datasets = list()
for(currname in names(metabolomics_processed_datasets)){
  d = metabolomics_processed_datasets[[currname]]
  inds = !is.na(d$row_annot$motrpac_comp_name)
  d$row_annot = d$row_annot[inds,]
  d$motrpac_met_names = d$motrpac_met_names[inds]
  if(!all(d$motrpac_met_names == d$row_annot$motrpac_comp_name)){
      print(paste("Error: row name problems in dataset ",currname))
  }
  for(j in 1:length(d$normalized_data)){
    d$normalized_data[[j]] = d$normalized_data[[j]][inds,]
    if(!all(rownames(d$normalized_data[[j]]) == rownames(d$row_annot))){
      print(paste("Error: row name problems in dataset ",currname))
    }
    rownames(d$normalized_data[[j]]) = d$motrpac_met_names
  }
  processed_named_datasets[[currname]] = d
}
gc()
##            used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells  7801770 416.7   14154432 756.0         NA 12171078 650.1
## Vcells 60886148 464.6   89054561 679.5     102400 68882124 525.6

Compound overlap among the platforms

Compare the overlap between the named metabolites. This is computed for all dataset pairs, but tissue-specific plots are presented for simplicity.

dataset2named_metabolites = lapply(processed_named_datasets,
       function(x)rownames(x$normalized_data[[1]]))
Nd = length(processed_named_datasets)
overlap_matrix = matrix(0,Nd,Nd,dimnames = list(
  names(processed_named_datasets),names(processed_named_datasets)
))
pairwise_shared_metabolites = list()
for(nn1 in names(processed_named_datasets)){
  pairwise_shared_metabolites[[nn1]] = list()
  for(nn2 in names(processed_named_datasets)){
    shared_metabolites = intersect(
      dataset2named_metabolites[[nn1]],dataset2named_metabolites[[nn2]])
    pairwise_shared_metabolites[[nn1]][[nn2]] = shared_metabolites
    overlap_matrix[nn1,nn2] = length(shared_metabolites)
  }
}

dataset2is_targeted = 
  sapply(processed_named_datasets,function(x)x$is_targeted)
dataset2tissue = 
  sapply(processed_named_datasets,function(x)x$tissue)
tissue2overlap_matrix = list()
met_lists_coverage = c()
for(tissue in unique(dataset2tissue)){
  currdatasets = names(dataset2tissue)[dataset2tissue==tissue]
  if(length(currdatasets)<2){next}
  tissue2overlap_matrix[[tissue]] = 
    overlap_matrix[currdatasets,currdatasets]
  all_tissue_mets = unique(unlist(dataset2named_metabolites))
  all_targeted_mets = unique(
    unlist(dataset2named_metabolites[dataset2is_targeted & dataset2tissue==tissue])
  )
  all_untargeted_mets = unique(
    unlist(dataset2named_metabolites[!dataset2is_targeted & dataset2tissue==tissue])
  )
  currv = rep("tar_only",length(all_tissue_mets))
  in_tar = all_tissue_mets %in% all_targeted_mets
  in_untar = all_tissue_mets %in% all_untargeted_mets
  currv[in_untar] = "untar_only"
  currv[in_untar & in_tar] = "both"
  currv = paste(tissue,currv,sep=",")
  met_lists_coverage = rbind(met_lists_coverage,
                             cbind(all_tissue_mets,currv))
}

save(overlap_matrix,tissue2overlap_matrix,met_lists_coverage,
     dataset2named_metabolites, pairwise_shared_metabolites,
    file = paste(analysis_out_files_dest,
                 "plarform_named_metabolite_overlaps.RData",sep=""))

# Show the pairwise overlap and the overall Venn diagram
# comparing targeted to untargeted
for (tissue in names(tissue2overlap_matrix)){
  rownames(tissue2overlap_matrix[[tissue]]) = gsub(
    paste0(tissue,","),"",rownames(tissue2overlap_matrix[[tissue]])
  )
  colnames(tissue2overlap_matrix[[tissue]]) = rownames(tissue2overlap_matrix[[tissue]])
  corrplot(tissue2overlap_matrix[[tissue]],
           is.corr = F,method = "number",number.cex = 0.7,cl.length = 11,
           col= gray.colors(100,rev=T,start=0,end=1,gamma=1))
  m = met_lists_coverage[grepl(tissue,met_lists_coverage[,2]),]
  inter_area = sum(grepl("both",m[,2]))
  tar_area = sum(grepl(",tar_only",m[,2])) + inter_area
  untar_area = sum(grepl(",untar_only",m[,2])) + inter_area
  venng = draw.pairwise.venn(tar_area,untar_area,inter_area,
            c("Targeted","Untargeted"),lty = rep("blank",2), 
            fill = c("pink", "cyan"), alpha = rep(0.5, 2),
            cat.dist = rep(0.01, 2),ind=F)
  grid.arrange(gTree(children=venng),
               top=paste(tissue,"measured named metabolites",sep=", "))
}

Correlations of shared metabolites

# For each normalization option (a pair from the list above) we compute
# all pairwise correlation matrices of the overlapping metabolites
norm_method2correlations = list()
for(norm_ind in 1:nrow(tar_vs_untar_norm_pairs)){
  tar_norm_method = tar_vs_untar_norm_pairs[norm_ind,1]
  untar_norm_method = tar_vs_untar_norm_pairs[norm_ind,2]
  metabolite_corrs = list()
  for(nn1 in names(processed_named_datasets)){
    nn1_tissue = processed_named_datasets[[nn1]]$tissue
    if(grepl(",tar,",nn1)){nn1_norm  = tar_norm_method}
    if(grepl(",untar,",nn1)){nn1_norm  = untar_norm_method}
    metabolite_corrs[[nn1]] = list()
    for(nn2 in names(processed_named_datasets)){
      if(nn2 == nn1){next}
      if(grepl(",tar,",nn2)){nn2_norm  = tar_norm_method}
      if(grepl(",untar,",nn2)){nn2_norm  = untar_norm_method}
      nn2_tissue = processed_named_datasets[[nn2]]$tissue
      if(nn1_tissue!=nn2_tissue){next}
      # get the numeric datasets and their annotation
      x = processed_named_datasets[[nn1]]$normalized_data[[nn1_norm]]
      y = processed_named_datasets[[nn2]]$normalized_data[[nn2_norm]]
      # compute correlations on overlapping bids
      currbids = intersect(colnames(x),colnames(y))
      x = x[,currbids];y=y[,currbids]
    
      # Compute the spearman correlation matrices of the shared metabolites
      shared_metabolites = intersect(rownames(x),rownames(y))
      if(length(shared_metabolites)==0){next}
      if(length(shared_metabolites)>1){
          metabolite_corrs[[nn1]][[nn2]] = cor(t(x[shared_metabolites,]),
                t(y[shared_metabolites,]),method = "spearman")
      }
      else{
          metabolite_corrs[[nn1]][[nn2]] = cor(x[shared_metabolites,],
                y[shared_metabolites,],method = "spearman")
      }
      if(any(is.na(metabolite_corrs[[nn1]][[nn2]]))){
        print("NAs in correlation computation")
        print(paste(nn1,"vs",nn2))
        print(table(is.na(metabolite_corrs[[nn1]][[nn2]])))
        print(sum(is.na(y)))
      }
    }
  }
  pair_name = paste(tar_norm_method,untar_norm_method,sep=";")
  norm_method2correlations[[pair_name]] = metabolite_corrs
}

# Plot some correlations
rep_correlations = c()
tissues = unique(sapply(processed_named_datasets,function(x)x$tissue))
for(pair_name in names(norm_method2correlations)){
  metabolite_corrs =  norm_method2correlations[[pair_name]]
  for(tissue in tissues){
    curr_datasets = names(metabolite_corrs)[
      grepl(tissue,names(metabolite_corrs))
    ]
    between_corrs = c()
    for(dataset in curr_datasets){
      l = metabolite_corrs[[dataset]]
      between_corrs = c(between_corrs,unname(unlist(sapply(l,diag))))
    }
    rep_correlations = rbind(rep_correlations,
          c(pair_name,tissue,mean(between_corrs,na.rm=T),sd(between_corrs,na.rm=T)))
  }
}
rep_correlations = rep_correlations[!is.na(rep_correlations[,3]),]
rep_correlations = data.frame(
  "NormMethod" = rep_correlations[,1],
  "Tissue" = rep_correlations[,2],
  "Mean" = as.numeric(rep_correlations[,3]),
  "SD" = as.numeric(rep_correlations[,4])
)
rep_correlations = rep_correlations[order(rep_correlations$Tissue),]
rep_correlations$NormMethod = gsub(
  "norm:none;metab:none","none",rep_correlations$NormMethod
)
rep_correlations$NormMethod = gsub(
  "norm:med;metab:log","Median",rep_correlations$NormMethod
)
print(
    ggplot(rep_correlations, aes(x=Tissue, y=Mean, fill=NormMethod)) +
      geom_bar(position=position_dodge(), stat="identity", colour='black') +
      geom_errorbar(aes(ymin=Mean-SD, ymax=Mean+SD),na.rm=T, 
                   width=.2,position=position_dodge(.9)) +
      theme(legend.position="top")
)

We examine the average correlation between the platforms (within tissues). Whenever two platforms share more than a single metabolite we plot both the average correlation between the same metabolites and between other metabolites. Adding the average correlation between platforms but with different metabolites is important as it gives some perspective to what a significant correlation is. That is, in many cases below, the average correlation may be greater than expected.

# Next examine the Spearman correlations between platforms
mean_abs<-function(x,...){return(mean(abs(x),...))}
sd_abs<-function(x,...){return(sd(abs(x),...))}
extract_diag_vs_non_diag<-function(corrs,func=mean,...){
  if(length(corrs)==1){
    return(c(same=func(corrs,...),other=NA))
  }
  same = func(diag(corrs),...)
  other = func(
    c(corrs[lower.tri(corrs,diag = F)]),...)
  return(c(same=same,other=other))
}

metabolite_corrs = norm_method2correlations[[1]]
for(tar_dataset in names(metabolite_corrs)){
  if(!grepl(",tar,",tar_dataset)){next}
  l = metabolite_corrs[[tar_dataset]]
  if(length(l)==0){next}
  corr_info = as.data.frame(t(sapply(l, extract_diag_vs_non_diag)))
  corr_sd = as.data.frame(t(sapply(l, extract_diag_vs_non_diag,func=sd)))
  # shorten the row names
  rownames(corr_info) = sapply(rownames(corr_info),
      function(x)paste(strsplit(x,split=",")[[1]][3:4],collapse=","))
  rownames(corr_sd) = rownames(corr_info)
  corr_info$dataset = rownames(corr_info)
  corr_sd$dataset = corr_info$dataset
  corr_info = melt(corr_info)
  corr_sd = melt(corr_sd)
  corr_info$sd = corr_sd$value
  print(
    ggplot(corr_info, aes(x=dataset, y=value, fill=variable)) +
      geom_bar(position=position_dodge(), stat="identity", colour='black') +
      geom_errorbar(aes(ymin=value-sd, ymax=value+sd),na.rm=T, 
                   width=.2,position=position_dodge(.9)) +
    ggtitle(tar_dataset) + xlab("Untargeted dataset") + ylab("Spearman") +
      labs(fill = "Pair type") + 
      theme(legend.position="top",legend.direction = "horizontal")
  )
}

Differential analysis

Compute the summary statistics for each combination of a metabolite, normalization method, and a pair of rat groups.

# For each normalization option (a pair from the list above) we compute
# all pairwise correlation matrices of the overlapping metabolites
single_metabolite_de = c()
for(currname in names(processed_named_datasets)){
  d = processed_named_datasets[[currname]]
  for(norm_method in names(d$normalized_data)){
    x = d$normalized_data[[norm_method]]
    # if data was not log-transformed then scale it
    if(!grepl("log",norm_method)){
      x = t(scale(t(x),center = T,scale = T))
    }
    x_meta = d$sample_meta
    covs = data.frame(
      sex = x_meta$animal.registration.sex,
      sample_order = x_meta$sample_order
    )
    for(j in 1:nrow(tp_pairs_for_de)){
      currres = t(apply(
        x,1,simple_pairwise_differential_abundance,
        tps = x_meta$animal.key.timepoint,
        is_control = x_meta$animal.key.is_control,
        tp1 = tp_pairs_for_de[j,1],tp1_iscontrol = tp_pairs_for_de[j,2],
        tp2 = tp_pairs_for_de[j,3],tp2_iscontrol = tp_pairs_for_de[j,4],
        covs = covs
      ))
      if(!all(rownames(x)==rownames(currres))){
        print("Error in rownames in DE results")
      }
      currres  = data.frame(currres,stringsAsFactors = F,check.names = F)
      currres$dataset = currname
      currres$is_targeted = d$is_targeted
      currres$tissue = d$tissue
      currres$norm = norm_method
      currres$is_log_scale = grepl("log",norm_method)
      currres = cbind(currres,
          data.frame(metabolite=rownames(x),stringsAsFactors = F,check.names = F))
      rownames(currres) = NULL
      single_metabolite_de = rbind(single_metabolite_de,currres)
    }
    # repeat the calculation if data was logged
    if(grepl("log",norm_method)){
      x = 2^x
      x = t(scale(t(x),center = T,scale = T))
      for(j in 1:nrow(tp_pairs_for_de)){
        currres = t(apply(
          x,1,simple_pairwise_differential_abundance,
          tps = x_meta$animal.key.timepoint,
          is_control = x_meta$animal.key.is_control,
          tp1 = tp_pairs_for_de[j,1],tp1_iscontrol = tp_pairs_for_de[j,2],
          tp2 = tp_pairs_for_de[j,3],tp2_iscontrol = tp_pairs_for_de[j,4],
          covs = covs
        ))
        currres  = data.frame(currres)
        currres$dataset = currname
        currres$is_targeted = d$is_targeted
        currres$tissue = d$tissue
        currres$norm = norm_method
        currres$is_log_scale = F
        currres = cbind(currres,
          data.frame(metabolite=rownames(x),stringsAsFactors = F,check.names = F))
        rownames(currres) = NULL
        single_metabolite_de = rbind(single_metabolite_de,currres)
      }
    }
  }
}
single_metabolite_de = single_metabolite_de[!is.na(single_metabolite_de$Pvalue),]
table(single_metabolite_de$norm,single_metabolite_de$is_log_scale)
##                       
##                        FALSE  TRUE
##   norm:med;metab:log   24000 24000
##   norm:none;metab:log   7010  6993
##   norm:none;metab:none 31009     0
sapply(single_metabolite_de,class)
##          tp1   tp1_isctrl          tp2   tp2_isctrl          Est          Std 
##    "numeric"    "numeric"    "numeric"    "numeric"    "numeric"    "numeric" 
##        Tstat       Pvalue      dataset  is_targeted       tissue         norm 
##    "numeric"    "numeric"  "character"    "logical"  "character"  "character" 
## is_log_scale   metabolite 
##    "logical"  "character"

Meta-analysis of DE summary stats

tp_analysis_name = apply(single_metabolite_de[,1:4],1,paste,collapse=",")
norm_method2comparison_results = list()
for(norm_ind in 1:nrow(tar_vs_untar_norm_pairs)){
  tar_norm = tar_vs_untar_norm_pairs[norm_ind,1]
  untar_norm = tar_vs_untar_norm_pairs[norm_ind,2]
  tar_is_log = grepl("log",tar_norm)
  untar_is_log = grepl("log",tar_norm)
  if(!tar_is_log || !untar_is_log){
    untar_is_log = F
    tar_is_log = F
  }
  meta_analysis_stats = list()
  for(tissue in tissues){
    for(tp in unique(tp_analysis_name)){
      curr_subset = single_metabolite_de[
        tp_analysis_name == tp &
          single_metabolite_de$tissue == tissue,]
      curr_subset = curr_subset[
        (curr_subset$is_targeted & curr_subset$norm == tar_norm |
           !curr_subset$is_targeted & curr_subset$norm == untar_norm) &
          (curr_subset$is_targeted & curr_subset$is_log_scale == tar_is_log |
             !curr_subset$is_targeted & curr_subset$is_log_scale == untar_is_log),]
      for(metabolite in unique(curr_subset$metabolite)){
        curr_met_data = curr_subset[curr_subset$metabolite==metabolite,]
        if(is.null(dim(curr_met_data))||nrow(curr_met_data)==1){next}
        if(length(unique(curr_met_data$is_targeted))<2){next}
        curr_met_data$var = curr_met_data$Std^2
        re_model1 = NULL;re_model2=NULL
        re_model_tar = NULL;re_model_untar = NULL
        try({re_model1 = rma.uni(curr_met_data$Est,curr_met_data$var,method="FE")})
        try({re_model2 = rma.mv(curr_met_data$Est,curr_met_data$var,
                           mods=curr_met_data$is_targeted,method="FE")})
        try({re_model_tar = rma.uni(
          curr_met_data[curr_met_data$is_targeted==1,"Est"],
          curr_met_data[curr_met_data$is_targeted==1,"var"],
          method="FE"
        )})
        try({re_model_untar = rma.uni(
          curr_met_data[curr_met_data$is_targeted==0,"Est"],
          curr_met_data[curr_met_data$is_targeted==0,"var"],
          method="FE"
        )})
        meta_analysis_stats[[paste(metabolite,tissue,tp,sep=";")]] = 
          list(curr_met_data=curr_met_data,re_model1=re_model1,
            re_model2 = re_model2,re_model_tar=re_model_tar,
            re_model_untar = re_model_untar,tar_is_log=tar_is_log,
            untar_is_log=untar_is_log)
      }
    }
  }
  norm_method2comparison_results[[paste(tar_norm,untar_norm,sep=";")]] = meta_analysis_stats
}
# Add the results to the single_metabolite_de data frame
single_met_de_meta_anal = c()
for(norm_pair in names(norm_method2comparison_results)){
  l = norm_method2comparison_results[[norm_pair]]
  currres = c()
  for(analysis_name in names(l)){
    arr = strsplit(analysis_name,split=";")[[1]]
    met = arr[1]
    tissue = arr[2]
    tps = strsplit(arr[3],split=",")[[1]]
    tp1 = tps[1]
    tp1_islog = tps[2]
    tp2 = tps[3]
    tp2_islog = tps[4]
    is_log_scale = length(gregexpr("log",norm_pair))==2
    # targeted meta-analysis
    obj = l[[analysis_name]]$re_model_tar
    stats = c(obj$beta,obj$se,obj$zval,obj$pval)
    v = c(tp1,tp1_islog,tp2,tp2_islog,stats,
          "tar_meta_analysis",TRUE,tissue,norm_pair,is_log_scale,met)
    names(v) = colnames(single_metabolite_de)
    currres = rbind(currres,v)
    # untargeted meta-analysis
    obj = l[[analysis_name]]$re_model_untar
    stats = c(obj$beta,obj$se,obj$zval,obj$pval)
    v = c(tp1,tp1_islog,tp2,tp2_islog,stats,
          "untar_meta_analysis",TRUE,tissue,norm_pair,is_log_scale,met)
    names(v) = colnames(single_metabolite_de)
    currres = rbind(currres,v)
    # all meta-analysis
    obj = l[[analysis_name]]$re_model1
    stats = c(obj$beta,obj$se,obj$zval,obj$pval)
    v = c(tp1,tp1_islog,tp2,tp2_islog,stats,
          "meta_analysis",TRUE,tissue,norm_pair,is_log_scale,met)
    names(v) = colnames(single_metabolite_de)
    currres = rbind(currres,v)
  }
  single_met_de_meta_anal = rbind(single_met_de_meta_anal,currres)
}
dim(single_met_de_meta_anal)
## [1] 14688    14
single_met_de_meta_anal = data.frame(single_met_de_meta_anal,stringsAsFactors = F)
# reformat the data frame
for(j in 1:ncol(single_met_de_meta_anal)){
  mode(single_met_de_meta_anal[[j]]) = mode(single_metabolite_de[[j]])
}
single_met_de_meta_anal[1:3,]
single_metabolite_de = rbind(single_metabolite_de,single_met_de_meta_anal)
save(norm_method2comparison_results,
    file = paste(analysis_out_files_dest,
                 "norm_method2comparison_results.RData",sep=""))

Results overlap with a fixed p-value threshold

Get the overlap of results per tissue and normalization method. The overlap matrices show overlap of metabolite sets from each platform. A discovery is a metabolite with \(p<p_thr\) in at least one of the group pairwise comparisons.

get_met_overlap_matrix_from_df<-function(df){
  met_list = list()
  for(dataset in unique(df$dataset)){
    met_list[[dataset]] = unique(df[df$dataset==dataset,"metabolite"])
  }
  n = length(met_list)
  m = matrix(0,n,n,dimnames = list(names(met_list),names(met_list)))
  for(i in 1:n){
    for(j in 1:n){
      m[i,j] = length(intersect(met_list[[i]],met_list[[j]]))
    }
  }
  return(m)
}
pthr = 0.001
tissues = unique(sapply(processed_named_datasets,function(x)x$tissue))
for(norm_ind in 1:nrow(tar_vs_untar_norm_pairs)){
  tar_norm = tar_vs_untar_norm_pairs[norm_ind,1]
  untar_norm = tar_vs_untar_norm_pairs[norm_ind,2]
  tar_is_log = grepl("log",tar_norm)
  untar_is_log = grepl("log",tar_norm)
  if(!tar_is_log || !untar_is_log){
    untar_is_log = F
    tar_is_log = F
  }
  for(tissue in tissues){
    curr_inds = single_metabolite_de$Pvalue < pthr &
      single_metabolite_de$tissue == tissue &
      (single_metabolite_de$is_targeted & single_metabolite_de$norm == tar_norm |
         !single_metabolite_de$is_targeted & single_metabolite_de$norm == untar_norm) &
      (single_metabolite_de$is_targeted & single_metabolite_de$is_log_scale == tar_is_log |
         !single_metabolite_de$is_targeted & single_metabolite_de$is_log_scale == untar_is_log)
    # add the meta-analyses
    curr_inds = curr_inds | 
      ( single_metabolite_de$Pvalue < pthr &
        single_metabolite_de$tissue == tissue &
        grepl("meta_analysis",single_metabolite_de$dataset) &
        single_metabolite_de$norm == paste(tar_norm,untar_norm,sep=";")
      )
    # curr_inds = curr_inds & grepl("Car",single_metabolite_de$metabolite)
    if(sum(curr_inds)==0){next}
    curr_overlaps = get_met_overlap_matrix_from_df(single_metabolite_de[curr_inds,])
    if(length(curr_overlaps) < 2){next}
    rownames(curr_overlaps) = gsub(paste0(tissue,","),"",rownames(curr_overlaps))
    colnames(curr_overlaps) = rownames(curr_overlaps)
    corrplot(curr_overlaps,
           is.corr = F,method = "number",number.cex = 0.8,cl.length = 11,
           col= gray.colors(100,rev=T,start=0,end=1,gamma=1))
    mtext(side=3,cex=1.1,adj=0.05,
         paste(tissue,paste0("tar:",tar_norm),
               paste0("untar:",untar_norm),sep="\n"))
  }
}

Plot selected examples

Here are the results for lactate in plasma and ACs in gastrocnemius.

meta_analysis_stats = norm_method2comparison_results[[1]]
lact_res = meta_analysis_stats[
  grepl("lact",names(meta_analysis_stats),ignore.case = T) &
    grepl("plasma",names(meta_analysis_stats),ignore.case = T)
]
for(lact_example in names(lact_res)){
  curr_labels = gsub("plasma,","",
           meta_analysis_stats[[lact_example]][[1]][,"dataset"])
  forest(meta_analysis_stats[[lact_example]]$re_model1,
       slab = curr_labels,
       main = lact_example,xlab = "Log fc",
       col = "blue",cex = 1.1)
}

Car_res = meta_analysis_stats[
  grepl("Car",names(meta_analysis_stats),ignore.case = T) &
    grepl("gastroc",names(meta_analysis_stats),ignore.case = T)
]
for(car_example in names(Car_res)){
  if(meta_analysis_stats[[car_example]]$re_model_tar$pval > 0.001){next}
  curr_labels = gsub("gastrocnemius,","",
           meta_analysis_stats[[car_example]][[1]][,"dataset"])
  forest(meta_analysis_stats[[car_example]]$re_model1,
       slab = curr_labels,
       main = car_example,xlab = "Log fc",
       col = "blue",cex = 1.1)
}

From the plots above we take the most extreme examples and examine their forest plots.

meta_analysis_stats = norm_method2comparison_results[[1]]
# P-value for the difference between targeted and untargeted
targeted_diff_p =
  sapply(meta_analysis_stats,function(x)x$re_model2$pval[2])

# P-values - targeted vs. untargeted
pvals_tar = sapply(meta_analysis_stats,function(x)x$re_model_tar$pval)
pvals_untar = sapply(meta_analysis_stats,function(x)x$re_model_untar$pval)
pvals_untar = unlist(pvals_untar[sapply(pvals_untar,length)>0])

agree_example = names(sample(which(pvals_tar< 1e-5 & pvals_untar < 1e-5 &
                                     targeted_diff_p > 0.1))[1])
simplify_labels_for_forest<-function(s){
  s = gsub(",untargeted","",s)
  tissue = strsplit(s,split=",")[[1]][1]
  s = gsub(paste(tissue,",",sep=""),"",s)
  return(s)
}
forest(meta_analysis_stats[[agree_example]]$re_model1,
  slab = simplify_labels_for_forest(
    meta_analysis_stats[[agree_example]][[1]][,"dataset"]),
  main = paste(agree_example,"significant in both, tar and untar agree",sep="\n"),
  xlab = "Log fc",col = "blue")

agree_p_disagree_beta = names(sample(which(pvals_tar< 1e-5 & pvals_untar < 1e-5 &
                                     targeted_diff_p < 0.01))[1])
forest(meta_analysis_stats[[agree_p_disagree_beta]]$re_model1,
  slab = simplify_labels_for_forest(
    meta_analysis_stats[[agree_p_disagree_beta]][[1]][,"dataset"]),
  main = paste(agree_p_disagree_beta,
               "significant in both, tar and untar disagree",sep="\n"),
  xlab = "Log fc",col = "blue")

disagree_example1 = names(sample(which(pvals_tar< 1e-5 & pvals_untar >0.1))[1])
forest(meta_analysis_stats[[disagree_example1]]$re_model1,
  slab = simplify_labels_for_forest(
    meta_analysis_stats[[disagree_example1]][[1]][,"dataset"]),
  main = paste(disagree_example1,
               "significant targeted, tar and untar disagree",sep="\n"),
  xlab = "Log fc",col = "blue")

disagree_example2 = names(sample(which(pvals_tar > 0.1 & pvals_untar < 1e-5))[1])
forest(meta_analysis_stats[[disagree_example2]]$re_model1,
  slab = simplify_labels_for_forest(
    meta_analysis_stats[[disagree_example2]][[1]][,"dataset"]),
  main = paste(disagree_example2,
               "significant in untargeted, tar and untar disagree",sep="\n"),
  xlab = "Log fc",col = "blue")

Plot some of Duke’s results - should fit their presentation (see shared drive).

d = metabolomics_processed_datasets$`plasma,tar,ac_duke,duke`
df = data.frame(
  car18_1 = d$normalized_data[[2]]["Car(18:1)",],
  sex = d$sample_meta$animal.registration.sex,
  time = d$sample_meta$animal.key.timepoint
)
# this should fit slide 6 in the Duke's pdf
boxplot(car18_1~time,data=df[df$sex==2,],main = "Car18:1 data in males, plasma")

d = metabolomics_processed_datasets$`gastrocnemius,tar,ka,duke`
df = data.frame(
  ketoleucine = d$normalized_data[[2]]["ketoleucine",],
  sex = d$sample_meta$animal.registration.sex,
  time = d$sample_meta$animal.key.timepoint
)
# this should fit slide 13 in the Duke's pdf
boxplot(ketoleucine~time,data=df[df$sex==1,])

Additional meta-analysis results

pthr = 0.001
tar_vs_untar_correlations = c()
for(tissue in tissues){
  for(pair_name in names(norm_method2comparison_results)){
    meta_analysis_stats = 
        norm_method2comparison_results[[pair_name]]
    meta_analysis_stats  = meta_analysis_stats[
        grepl(tissue,names(meta_analysis_stats))
      ]
    if(is.null(meta_analysis_stats) || length(meta_analysis_stats)==0){next}
  
    # P-value for the difference between targeted and untargeted
    targeted_diff_p = 
    sapply(meta_analysis_stats,function(x)x$re_model2$pval[2])

   # P-values - targeted vs. untargeted
    pvals_tar = sapply(meta_analysis_stats,function(x)x$re_model_tar$pval)
    pvals_untar = sapply(meta_analysis_stats,function(x)x$re_model_untar$pval)
    pvals_untar = unlist(pvals_untar[sapply(pvals_untar,length)>0])
    significant_in = rep("None",length(pvals_untar))
    significant_in[pvals_tar<pthr] = "Targeted"
    significant_in[pvals_untar<pthr] = "Untargeted"
    significant_in[pvals_tar<pthr & pvals_untar<pthr] = "Both"
    significant_diff = targeted_diff_p<pthr
    rho = cor(-log10(pvals_tar),-log10(pvals_untar),method = "pearson")
    # Betas - targeted vs. untargeted
    betas_tar = 
      sapply(meta_analysis_stats,function(x)x$re_model_tar$beta[1,1])
    betas_untar = 
      sapply(meta_analysis_stats,function(x)x$re_model_untar$beta[1,1])
    betas_untar = unlist(betas_untar[sapply(betas_untar,length)>0])
    df = data.frame(
      targeted = betas_tar,
      untargeted = betas_untar,
      significant_in = significant_in,
      significant_diff = significant_diff
    )
    rho_beta = cor(betas_untar,betas_tar,method = "pearson")
    rhop = cor.test(betas_untar,betas_tar,method = "pearson")$p.value
    print(
      ggplot(df, aes(x=targeted, y=untargeted,
                 shape=significant_diff, color=significant_in)) +
      geom_point() + geom_abline(slope=1,intercept = 0) + 
      ggtitle(paste(tissue, pair_name,
                    "effects, rho=:",format(rho_beta,digits=2)))
    )
  
    tar_vs_untar_correlations = rbind(tar_vs_untar_correlations,
                                      c(tissue,pair_name,rho,rho_beta))
   # draw a venn diagram
    inter_area = sum(significant_in=="Both")
    tar_area = sum(significant_in=="Targeted") + inter_area
    untar_area = sum(significant_in=="Untargeted") + inter_area
    subt = paste("Not significant in both:",table(significant_in)["None"])
    
    ss = paste(sample(names(pvals_untar)[significant_in=="Targeted"])[1:10],collapse=";")
    ss = gsub(paste(",",tissue,sep=""),"",ss)
    # ss
    ss = paste(sample(names(pvals_untar)[significant_in=="Untargeted"])[1:10],collapse=";")
    ss = gsub(paste(",",tissue,sep=""),"",ss)
    # ss
    
    venng = draw.pairwise.venn(tar_area,untar_area,inter_area,
            c("Targeted","Untargeted"),lty = rep("blank",2), 
            fill = c("pink", "cyan"), alpha = rep(0.5, 2),
            cat.dist = rep(0.01, 2),ind=F)
    # grid.newpage()
    grid.arrange(gTree(children=venng), 
                 top=paste(tissue,pair_name), bottom=subt)
  }
}